# Written by Zachary Frankel December 2009
# For Biophysics 101 at Harvard College
# This file contains methods for reading in CSV files with 
# genotype and phenotype data

import csv
import sys
import math
import ols
from numpy import *
from numpy.random import randn
# maps each genotype key to the total number of data points recorded for it
totaldic = dict()

# one dictionary per phenotype (indexed by phenotype number); each maps a
# genotype key to the number of data points with that genotype and phenotype
datalist = list()

# number of SNPs
snpnum = 0

def cond_prob(phenotype, genotype):
    """Return the conditional probability of a phenotype given a genotype.

    The estimate is the count stored for ``genotype`` under ``phenotype`` in
    the module-level ``datalist`` divided by the total count stored for
    ``genotype`` in the module-level ``totaldic``.

    phenotype -- integer index into ``datalist``
    genotype  -- genotype key as stored in ``datalist`` / ``totaldic``

    Returns a float; 0.0 when the genotype has no entry for the phenotype.
    Raises IndexError when ``phenotype`` is out of range for ``datalist``.
    """
    counts = datalist[phenotype]

    # `in` replaces dict.has_key(), which does not exist in Python 3
    if genotype in counts:
        # float() forces true division on Python 2 as well
        return float(counts[genotype]) / totaldic[genotype]
    return 0.0

def load_data(filename, numphenotypes):
    """Load genotype/phenotype counts from a CSV file.

    The first row of the file is treated as a header and skipped.  In every
    other non-empty row the last column is the integer phenotype index and
    the preceding columns are concatenated to form the genotype key.

    Any previously loaded data is discarded: the module-level ``datalist``
    and ``totaldic`` are emptied in place and refilled, and the module-level
    ``snpnum`` is set to the length of the genotype key of the last data row
    (0 when the file contains no data rows).

    filename      -- path of the CSV file to read
    numphenotypes -- number of phenotypes; ``datalist`` gets one dictionary
                     for each of them

    Returns ``[datalist, totaldic]``.  Raises ValueError when a phenotype
    column is not an integer and IndexError when a phenotype index is
    ``numphenotypes`` or larger.
    """
    # without this declaration the assignment below would only bind a local
    global snpnum

    # drop whatever an earlier call stored; clearing in place keeps any
    # reference to these containers held elsewhere valid
    del datalist[:]
    totaldic.clear()
    datalist.extend({} for _ in range(numphenotypes))

    # the csv module wants a binary file on Python 2 and a text file opened
    # with newline="" on Python 3
    if sys.version_info[0] >= 3:
        csvfile = open(filename, "r", newline="")
    else:
        csvfile = open(filename, "rb")

    last_key = ""
    with csvfile:
        rows = csv.reader(csvfile)
        next(rows, None)  # the first row is the header

        for row in rows:
            # csv.reader yields [] for blank lines; they carry no data
            if not row:
                continue

            # split on columns rather than on characters so that a
            # phenotype with more than one digit does not leak into the key
            genotype = "".join(row[:-1])
            phenotype = int(row[-1])

            per_phenotype = datalist[phenotype]
            per_phenotype[genotype] = per_phenotype.get(genotype, 0) + 1
            totaldic[genotype] = totaldic.get(genotype, 0) + 1
            last_key = genotype

    snpnum = len(last_key)
    return [datalist, totaldic]

def str_to_arr(phenotype):
    """Convert a string of digits into a 1-D numpy array.

    phenotype -- string whose characters are each converted with ``int``
                 (NOTE(review): the function builds ``genotype_array``, so
                 this is presumably a genotype key despite the parameter
                 name -- confirm against callers)

    Returns a float array with one element per character; an empty string
    gives an empty array.  Raises ValueError when a character is not a
    valid integer literal.
    """
    # build the array in one step; growing it with numpy append() copies
    # the whole array on every iteration.  dtype=float keeps the element
    # type that appending to an initially empty array produced.
    return array([int(digit) for digit in phenotype], dtype=float)
                               
    

